In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, r2_score, mean_absolute_error

In [52]:
path = '../input/melbourne-housing-snapshot/melb_data.csv'

In [63]:
melbourne_data = pd.read_csv(path)

In [54]:
melbourne_data.describe()


Out[54]:
Rooms Price Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt Lattitude Longtitude Propertycount
count 13580.000000 1.358000e+04 13580.000000 13580.000000 13580.000000 13580.000000 13518.000000 13580.000000 7130.000000 8205.000000 13580.000000 13580.000000 13580.000000
mean 2.937997 1.075684e+06 10.137776 3105.301915 2.914728 1.534242 1.610075 558.416127 151.967650 1964.684217 -37.809203 144.995216 7454.417378
std 0.955748 6.393107e+05 5.868725 90.676964 0.965921 0.691712 0.962634 3990.669241 541.014538 37.273762 0.079260 0.103916 4378.581772
min 1.000000 8.500000e+04 0.000000 3000.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1196.000000 -38.182550 144.431810 249.000000
25% 2.000000 6.500000e+05 6.100000 3044.000000 2.000000 1.000000 1.000000 177.000000 93.000000 1940.000000 -37.856822 144.929600 4380.000000
50% 3.000000 9.030000e+05 9.200000 3084.000000 3.000000 1.000000 2.000000 440.000000 126.000000 1970.000000 -37.802355 145.000100 6555.000000
75% 3.000000 1.330000e+06 13.000000 3148.000000 3.000000 2.000000 2.000000 651.000000 174.000000 1999.000000 -37.756400 145.058305 10331.000000
max 10.000000 9.000000e+06 48.100000 3977.000000 20.000000 8.000000 10.000000 433014.000000 44515.000000 2018.000000 -37.408530 145.526350 21650.000000

In [64]:
melbourne_data.columns


Out[64]:
Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [65]:
melbourne_data = melbourne_data.dropna(axis=0)  # Drop every row with any missing value; this roughly halves the dataset (13580 -> 6196 rows, as describe() below shows)
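
Dropping rows wholesale is the bluntest way to handle missing data, so it is worth a quick accounting before committing to it. A minimal sketch (not part of the original run) that shows what dropna() costs:

# Count missing values per column on the raw data, then compare row counts
raw = pd.read_csv(path)
print(raw.isnull().sum().sort_values(ascending=False).head())
print(len(raw), '->', len(raw.dropna(axis=0)))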

In [66]:
y = melbourne_data.Price

In [67]:
# First candidate feature set; superseded by the next cell
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'BuildingArea']
X = melbourne_data[melbourne_features]
y = melbourne_data['Price']

In [68]:
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt']
X = melbourne_data[melbourne_features]
# X = pd.get_dummies(X)  # not needed here: every selected feature is already numeric
y = melbourne_data['Price']
X.describe()


Out[68]:
Rooms Bathroom Landsize BuildingArea YearBuilt
count 6196.000000 6196.000000 6196.000000 6196.000000 6196.000000
mean 2.931407 1.576340 471.006940 141.568645 1964.081988
std 0.971079 0.711362 897.449881 90.834824 38.105673
min 1.000000 1.000000 0.000000 0.000000 1196.000000
25% 2.000000 1.000000 152.000000 91.000000 1940.000000
50% 3.000000 1.000000 373.000000 124.000000 1970.000000
75% 4.000000 2.000000 628.000000 170.000000 2000.000000
max 8.000000 8.000000 37000.000000 3112.000000 2018.000000

In [69]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [70]:
model = RandomForestRegressor(random_state=1, n_estimators=100)

In [71]:
model.fit(train_X, train_y)


Out[71]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [72]:
tree_model = DecisionTreeRegressor(random_state=1)
tree_model.fit(train_X, train_y)


Out[72]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [73]:
# NB: sklearn metrics expect (y_true, y_pred); the arguments below are reversed.
# MAE is symmetric so it is unaffected, but explained variance and R2 are not.
random_forest_val_mae = mean_absolute_error(model.predict(val_X), val_y)
tree_val_mae = mean_absolute_error(tree_model.predict(val_X), val_y)
print(random_forest_val_mae, tree_val_mae)
print(explained_variance_score(model.predict(val_X), val_y), explained_variance_score(tree_model.predict(val_X), val_y))
print(r2_score(model.predict(val_X), val_y), r2_score(tree_model.predict(val_X), val_y))


267603.5842811307 356445.63324008323
0.2835345265623028 0.13786117875076964
0.2832649586981667 0.13738389008729823
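
The argument order matters here: with the arguments in sklearn's documented (y_true, y_pred) order the last two rows would come out differently. A corrected sketch (not part of the original run):

rf_preds = model.predict(val_X)
tree_preds = tree_model.predict(val_X)
print(mean_absolute_error(val_y, rf_preds), mean_absolute_error(val_y, tree_preds))
print(explained_variance_score(val_y, rf_preds), explained_variance_score(val_y, tree_preds))
print(r2_score(val_y, rf_preds), r2_score(val_y, tree_preds))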

In [75]:
columns = X.columns
# Build a one-row frame that matches the training feature order
my_house = pd.DataFrame([{'Rooms': 2, 'Bathroom': 1, 'Landsize': 700, 'BuildingArea': 150, 'YearBuilt': 1990}], columns=columns)

In [78]:
model.predict([[2, 1, 700, 150, 1990], ])


Out[78]:
array([951210.])
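
Passing a bare nested list works, but it relies on remembering the exact feature order. The my_house frame built in In [75] carries the column names along, so the equivalent call is less fragile:

model.predict(my_house)  # same five features, aligned by the columns of X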

XGBoost


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
path = '../input/melbourne-housing-snapshot/melb_data.csv'
data = pd.read_csv(path)
data.dropna(axis=0, subset=['Price'], inplace=True)
y = data.Price
X = data.drop(['Price'], axis=1).select_dtypes(exclude=['object'])
train_X, test_X, train_y, test_y = train_test_split(X.as_matrix(), y.as_matrix(), test_size=0.25)

# Fill missing values with the column mean (Imputer is deprecated; see the warning below)
my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

X.describe()


/home/pyr0/.virtualenvs/ml-notebook-to-prod/lib/python3.6/site-packages/ipykernel_launcher.py:8: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  
/home/pyr0/.virtualenvs/ml-notebook-to-prod/lib/python3.6/site-packages/sklearn/utils/deprecation.py:58: DeprecationWarning: Class Imputer is deprecated; Imputer was deprecated in version 0.20 and will be removed in 0.22. Import impute.SimpleImputer from sklearn instead.
  warnings.warn(msg, category=DeprecationWarning)
Out[15]:
Rooms Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt Lattitude Longtitude Propertycount
count 13580.000000 13580.000000 13580.000000 13580.000000 13580.000000 13518.000000 13580.000000 7130.000000 8205.000000 13580.000000 13580.000000 13580.000000
mean 2.937997 10.137776 3105.301915 2.914728 1.534242 1.610075 558.416127 151.967650 1964.684217 -37.809203 144.995216 7454.417378
std 0.955748 5.868725 90.676964 0.965921 0.691712 0.962634 3990.669241 541.014538 37.273762 0.079260 0.103916 4378.581772
min 1.000000 0.000000 3000.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1196.000000 -38.182550 144.431810 249.000000
25% 2.000000 6.100000 3044.000000 2.000000 1.000000 1.000000 177.000000 93.000000 1940.000000 -37.856822 144.929600 4380.000000
50% 3.000000 9.200000 3084.000000 3.000000 1.000000 2.000000 440.000000 126.000000 1970.000000 -37.802355 145.000100 6555.000000
75% 3.000000 13.000000 3148.000000 3.000000 2.000000 2.000000 651.000000 174.000000 1999.000000 -37.756400 145.058305 10331.000000
max 10.000000 48.100000 3977.000000 20.000000 8.000000 10.000000 433014.000000 44515.000000 2018.000000 -37.408530 145.526350 21650.000000
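
Both warnings above point at their own fixes: .values replaces .as_matrix(), and sklearn.impute.SimpleImputer replaces preprocessing.Imputer. A modernized sketch of the same cell, assuming mean imputation is still the intent:

from sklearn.impute import SimpleImputer

train_X, test_X, train_y, test_y = train_test_split(X.values, y.values, test_size=0.25)
my_imputer = SimpleImputer()  # strategy='mean' by default, matching the old Imputer
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)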

In [16]:
from xgboost import XGBRegressor

my_model = XGBRegressor()
# verbose=False keeps fit() from printing an update on every boosting round
my_model.fit(train_X, train_y, verbose=False)


Out[16]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [17]:
# make predictions
predictions = my_model.predict(test_X)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))
print("Explained Variance Score :" + str(explained_variance_score(predictions, test_y)))
print("R2 Score :" + str(r2_score(predictions, test_y)))


Mean Absolute Error : 207814.7592783505
Explained Variance Score :0.5522026409838039
R2 Score :0.5521476993058099

In [18]:
my_model = XGBRegressor(n_estimators=1000)
my_model.fit(train_X, train_y, early_stopping_rounds=5, 
             eval_set=[(test_X, test_y)], verbose=False)


Out[18]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [19]:
print("Mean Absolute Error : " + str(mean_absolute_error(my_model.predict(test_X), test_y)))
print("Explained Variance Score :" + str(explained_variance_score(my_model.predict(test_X), test_y)))
print("R2 Score :" + str(r2_score(my_model.predict(test_X), test_y)))


Mean Absolute Error : 192326.96400727172
Explained Variance Score :0.6560821739618254
R2 Score :0.656073140740969
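
With early stopping the fitted estimator records where validation error bottomed out. In the xgboost 0.8x sklearn API (assumed here) the attributes below expose it, and predict() can be capped at that tree count:

print(my_model.best_iteration, my_model.best_ntree_limit)
predictions = my_model.predict(test_X, ntree_limit=my_model.best_ntree_limit)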

In [20]:
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(train_X, train_y, early_stopping_rounds=5, 
             eval_set=[(test_X, test_y)], verbose=False)
# make predictions
predictions = my_model.predict(test_X)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))
print("Explained Variance Score :" + str(explained_variance_score(predictions, test_y)))
print("R2 Score :" + str(r2_score(predictions, test_y)))


Mean Absolute Error : 194934.63486284978
Explained Variance Score :0.6365735060800535
R2 Score :0.6365503812225121

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
path = '../input/melbourne-housing-snapshot/melb_data.csv'
data = pd.read_csv(path)
data.dropna(axis=0, subset=['Price'], inplace=True)
y = data.Price
X = data.drop(['Price'], axis=1).select_dtypes(exclude=['object'])
# melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt',]
# X = X[melbourne_features]
columns = X.columns

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.20, random_state=1)

# No imputation this time: XGBoost handles missing values natively,
# so the raw NaNs can flow straight into fit() and predict()
# my_imputer = Imputer()
# train_X = my_imputer.fit_transform(train_X)
# test_X = my_imputer.transform(test_X)

X.describe()


Out[21]:
Rooms Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt Lattitude Longtitude Propertycount
count 13580.000000 13580.000000 13580.000000 13580.000000 13580.000000 13518.000000 13580.000000 7130.000000 8205.000000 13580.000000 13580.000000 13580.000000
mean 2.937997 10.137776 3105.301915 2.914728 1.534242 1.610075 558.416127 151.967650 1964.684217 -37.809203 144.995216 7454.417378
std 0.955748 5.868725 90.676964 0.965921 0.691712 0.962634 3990.669241 541.014538 37.273762 0.079260 0.103916 4378.581772
min 1.000000 0.000000 3000.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1196.000000 -38.182550 144.431810 249.000000
25% 2.000000 6.100000 3044.000000 2.000000 1.000000 1.000000 177.000000 93.000000 1940.000000 -37.856822 144.929600 4380.000000
50% 3.000000 9.200000 3084.000000 3.000000 1.000000 2.000000 440.000000 126.000000 1970.000000 -37.802355 145.000100 6555.000000
75% 3.000000 13.000000 3148.000000 3.000000 2.000000 2.000000 651.000000 174.000000 1999.000000 -37.756400 145.058305 10331.000000
max 10.000000 48.100000 3977.000000 20.000000 8.000000 10.000000 433014.000000 44515.000000 2018.000000 -37.408530 145.526350 21650.000000

In [22]:
# Test again but only with the features we can expect from users
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(train_X, train_y, early_stopping_rounds=5, 
             eval_set=[(test_X, test_y)], verbose=False)
# make predictions
predictions = my_model.predict(test_X)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))
print("Explained Variance Score :" + str(explained_variance_score(predictions, test_y)))
print("R2 Score :" + str(r2_score(predictions, test_y)))


Mean Absolute Error : 189337.55038717322
Explained Variance Score :0.7119030500659722
R2 Score :0.7118993480823608

In [23]:
my_house = pd.DataFrame([{'Rooms': 2, 'Bathroom': 1, 'Landsize': 700, 'BuildingArea': 150, 'YearBuilt': 1990}, ], columns=columns)

In [24]:
my_house


Out[24]:
Rooms Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt Lattitude Longtitude Propertycount
0 2 NaN NaN NaN 1 NaN 700 150 1990 NaN NaN NaN

In [25]:
# my_house = my_imputer.transform(my_house)

In [26]:
my_house


Out[26]:
Rooms Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt Lattitude Longtitude Propertycount
0 2 NaN NaN NaN 1 NaN 700 150 1990 NaN NaN NaN

In [27]:
my_model.predict(my_house)


Out[27]:
array([1135836.4], dtype=float32)

In [28]:
z = {'model': my_model, 'imputer': my_imputer}  # bundle the model with the imputer fitted in In [15] so both ship together

In [29]:
import pickle

In [30]:
s = pickle.dumps(z)

In [31]:
clf2 = pickle.loads(s)

In [32]:
clf2['model'].predict(my_house)


Out[32]:
array([1135836.4], dtype=float32)
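
dumps/loads round-trips the bundle in memory; persisting it for a service is one step more. A sketch with a hypothetical filename:

with open('model_bundle.pkl', 'wb') as f:
    pickle.dump(z, f)

with open('model_bundle.pkl', 'rb') as f:
    bundle = pickle.load(f)
bundle['model'].predict(my_house)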

A note from the XGBoost documentation on model.predict:

Note

This function is not thread safe. For each booster object, predict can only be called from one thread. If you want to run predictions from multiple threads, call bst.copy() to make copies of the model object and then call predict() on each copy.
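
In practice that means a multi-threaded server should give each worker its own copy of the underlying booster rather than sharing one estimator. A sketch against the Booster API:

import xgboost as xgb

booster = my_model.get_booster()   # the shared, non-thread-safe object
worker_booster = booster.copy()    # one copy per worker thread
worker_booster.predict(xgb.DMatrix(my_house))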

In [75]:
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
xgb_model = XGBRegressor()
clf = GridSearchCV(
    xgb_model,
    {'max_depth': [6,],
     'learning_rate': [0.05,],
     'n_estimators': [450, 470, 475, 480, 485]},
    n_jobs=4,
    cv=3,
    verbose=2
)
clf.fit(train_X, train_y)  # eval_set and early stopping are not passed through the grid search here
print(clf.best_score_)
print(clf.best_params_)

predictions = clf.predict(test_X)

print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))
print("Explained Variance Score :" + str(explained_variance_score(predictions, test_y)))
print("R2 Score :" + str(r2_score(predictions, test_y)))


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:   16.6s finished
0.7861661243722181
{'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 480}
Mean Absolute Error : 164896.49655398563
Explained Variance Score :0.7839552604329917
R2 Score :0.7838708909683693
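
The winning n_estimators (480) sits in the interior of the grid rather than at an edge, which suggests the search range was wide enough. cv_results_ makes that easy to eyeball; a sketch:

results = pd.DataFrame(clf.cv_results_)
print(results[['param_n_estimators', 'mean_test_score', 'std_test_score']])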

In [68]:
print(type(clf))
print(type(clf.best_estimator_))
print(clf.best_estimator_.predict(my_house))
clf.predict(my_house)


<class 'sklearn.model_selection._search.GridSearchCV'>
<class 'xgboost.sklearn.XGBRegressor'>
[1686340.5]
Out[68]:
array([1686340.5], dtype=float32)

In [120]:
# print(type(clf))
# print(type(clf.best_estimator_))
# print(clf.best_estimator_.predict(my_house))
# clf.predict(my_house)
import numpy as np
print(columns)
print([col for col in columns])
features = {col: np.NaN for col in columns}
my_house = {'Rooms': 2, 'Bathroom': 1, 'Landsize': 700, 'BuildingArea': 150, 'YearBuilt': 1990}
features.update(my_house)
print(features)
pd.DataFrame([features], columns=columns)


Index(['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude',
       'Propertycount'],
      dtype='object')
['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']
{'Rooms': 2, 'Distance': nan, 'Postcode': nan, 'Bedroom2': nan, 'Bathroom': 1, 'Car': nan, 'Landsize': 700, 'BuildingArea': 150, 'YearBuilt': 1990, 'Lattitude': nan, 'Longtitude': nan, 'Propertycount': nan}
Out[120]:
Rooms Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt Lattitude Longtitude Propertycount
0 2 NaN NaN NaN 1 NaN 700 150 1990 NaN NaN NaN
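
That dict-merge pattern is exactly what a production endpoint will need, so it is worth wrapping in a helper. A hypothetical make_input, not part of the original notebook:

def make_input(house, columns):
    """Build a one-row frame with NaN for any feature the user did not supply."""
    features = {col: np.nan for col in columns}
    features.update(house)
    return pd.DataFrame([features], columns=columns)

make_input({'Rooms': 2, 'Bathroom': 1, 'Landsize': 700,
            'BuildingArea': 150, 'YearBuilt': 1990}, columns)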

In [105]:
clf.best_estimator_.save_model('./test_model')

In [114]:
some_model = XGBRegressor()
some_model.load_model('./test_model')
some_model.predict(pd.DataFrame([features], columns=columns), validate_features=True)


Out[114]:
array([1667678.1], dtype=float32)

In [35]:
my_model.predict(pd.DataFrame([{'Rooms': 6, 'Bathroom': 2, 'Landsize': 2500, 'BuildingArea': 800, 'YearBuilt': 1980}, ], columns=columns))


Out[35]:
array([3371605.5], dtype=float32)

In [70]:
# What happens if I drop all the other columns from test_X?
reduced_test_X = test_X[['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt']]
# Re-expanding to the full column set brings the dropped features back as all-NaN
reduced_test_X = pd.DataFrame(reduced_test_X, columns=columns)

predictions = clf.predict(reduced_test_X)

print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))
print("Explained Variance Score :" + str(explained_variance_score(predictions, test_y)))
print("R2 Score :" + str(r2_score(predictions, test_y)))


Mean Absolute Error : 1177564.2966909057
Explained Variance Score :0.1458039457147109
R2 Score :-2.23788192694643

In [50]:
# The scores collapse because the model was trained with those columns populated.
# Ok, let's train both models with the NaN-padded frame and re-evaluate.

xgb_model = XGBRegressor()
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2,4,6],
                    'n_estimators': [50,100,200]}, verbose=1)
clf.fit(pd.DataFrame(train_X[['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt']], columns=columns), train_y)
print(clf.best_score_)
print(clf.best_params_)

predictions = clf.predict(reduced_test_X)

print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))
print("Explained Variance Score :" + str(explained_variance_score(predictions, test_y)))
print("R2 Score :" + str(r2_score(predictions, test_y)))

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(train_X, train_y, early_stopping_rounds=5, 
             eval_set=[(reduced_test_X, test_y)], verbose=False)
# make predictions
predictions = my_model.predict(reduced_test_X)

print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))
print("Explained Variance Score :" + str(explained_variance_score(predictions, test_y)))
print("R2 Score :" + str(r2_score(predictions, test_y)))


/home/pyr0/.virtualenvs/ml-notebook-to-prod/lib/python3.6/site-packages/sklearn/model_selection/_split.py:1943: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    8.5s finished
0.485596032269942
{'max_depth': 4, 'n_estimators': 50}
Mean Absolute Error : 311226.66502899484
Explained Variance Score :-0.06182310769657984
R2 Score :-0.06194099983369239
Mean Absolute Error : 343490.41036910895
Explained Variance Score :-0.4369656724788491
R2 Score :-0.43697113705030755
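
Both NaN-padded variants score far worse than the model evaluated on the full feature set: the trees were grown with those columns populated, and their splits lose all signal when every value routes through the learned missing-value branch. The cleaner approach for a user-facing service is to train end to end on only the features users can supply. A sketch (assumed, not run in the original):

user_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt']
slim_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
slim_model.fit(train_X[user_features], train_y,
               early_stopping_rounds=5,
               eval_set=[(test_X[user_features], test_y)], verbose=False)
predictions = slim_model.predict(test_X[user_features])
print(mean_absolute_error(test_y, predictions))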